Problem description can be found here.

Preliminaries

First load the necessary packages for this exercise.

# Load default settings for R Markdown -- see file for more details
source("shared/defaults.R")

options(stringsAsFactors = FALSE)
packages <- c("dplyr","ggplot2","randomForest","tidyr","leaflet","purrr","grDevices")
load_or_install.packages(packages)

data_dir <- "data/"

si <- sessionInfo()
base_pkg_str <- paste0("Base Packages: ",paste(si[["basePkgs"]], collapse=", "))
attached_pkg_str <- paste0("Attached Packages: ",paste(names(si[["otherPkgs"]]), collapse=", "))
cat(paste0(base_pkg_str,"\n",attached_pkg_str))
## Base Packages: stats, graphics, grDevices, utils, datasets, methods, base
## Attached Packages: purrr, leaflet, tidyr, randomForest, ggplot2, dplyr, knitr

Exploration

About the Data

We kick off by exploring the data that was provided:

data_overview <- function(data,
                          null_fn = function(cname) { paste0(cname," == '' | is.na(",cname,")")}) {

                  cols_summary <- data.frame(ColumnNames = colnames(data))
                  cols_summary$Type <- lapply(training_set, class) %>%
                                       toupper()
                  cols_summary$Examples <- lapply(cols_summary$ColumnNames,
                                                  function(cname) {
                                                    training_set %>%
                                                    filter_(paste0("!(",null_fn(cname),")")) %>%
                                                    `[[`(cname) %>%
                                                    unique() -> filtered_set
                                                    filtered_set[1:min(5, length(filtered_set))] %>%
                                                    paste(collapse=' | ')
                                                  })
                  cols_summary$EmptyValues <- lapply(cols_summary$ColumnNames,
                                                     function(cname) {
                                                        training_set %>%
                                                        filter_(null_fn(cname)) %>%
                                                        nrow()
                                                     })
                  cols_summary$PctFilled <- lapply(cols_summary$EmptyValues,
                                                   function(x) {
                                                      ((nrow(training_set) - x) / nrow(training_set)) %>%
                                                      `*`(100) %>% round(0) %>%
                                                      paste0("%")
                                                   })

                  select(cols_summary, ColumnNames, Type, Examples, PctFilled)
                }


training_set <- read.csv(paste0(data_dir, "train.csv"))

cols_summary <- data_overview(training_set)

# Parse Output
cols_sum_pp <- cols_summary
colnames(cols_sum_pp) <- sapply(colnames(cols_sum_pp),
                                function(.) { gsub('([[:upper:]])', ' \\1', .) })

knitr::kable(cols_sum_pp,format="html")
Column Names Type Examples Pct Filled
PassengerId INTEGER 1 | 2 | 3 | 4 | 5 100%
Survived INTEGER 0 | 1 100%
Pclass INTEGER 3 | 1 | 2 100%
Name CHARACTER Braund, Mr. Owen Harris | Cumings, Mrs. John Bradley (Florence Briggs Thayer) | Heikkinen, Miss. Laina | Futrelle, Mrs. Jacques Heath (Lily May Peel) | Allen, Mr. William Henry 100%
Sex CHARACTER male | female 100%
Age NUMERIC 22 | 38 | 26 | 35 | 54 80%
SibSp INTEGER 1 | 0 | 3 | 4 | 2 100%
Parch INTEGER 0 | 1 | 2 | 5 | 3 100%
Ticket CHARACTER A/5 21171 | PC 17599 | STON/O2. 3101282 | 113803 | 373450 100%
Fare NUMERIC 7.25 | 71.2833 | 7.925 | 53.1 | 8.05 100%
Cabin CHARACTER C85 | C123 | E46 | G6 | C103 23%
Embarked CHARACTER S | C | Q 100%


Based on the summary above, notice the following:

  • Names are aggregated in the following format: Last_Name, Title First_Name.
  • There are different types of Ticket formats.
  • Age has missing data to be populated.
  • Cabin data is sparse, and hence we may want to exclude this for the prediction model.

Preliminary Insights

On Income

Hypothesis 1: Higher income individuals are more likely to survive.

High-income individuals could pay more for seats in the lifeboats, and hence escape the disaster.

income_set <- training_set %>%
              group_by(Pclass) %>%
              summarise(CohortSize = n(),
                        SurvivalRate = sum(Survived)/n())

income_plot <-  ggplot(income_set, aes(x=SurvivalRate,
                                      y = 0,
                                      color=as.factor(Pclass),
                                      size=CohortSize)) +
                theme_lk() +
                theme(plot.margin = unit(c(0,0,0,-40),'pt')) +
                # Legends
                theme(
                  legend.position = c(0.5,0.2),
                  legend.box.just = c(0.5,0.5),
                  legend.justification = c(0.5,0.5)
                ) +
                scale_color_manual(name = "Passenger Class",
                                  labels = c("1"="1 (High-Income)","2"="2 (Medium-Income)","3"="3 (Low-Income)"),
                                  values = c("1"=as.character(get_color("green")),
                                             "2"=as.character(get_color("yellow")),
                                             "3"=as.character(get_color("red"))),
                                  guide = guide_legend(order = 1,
                                                       direction = "vertical",
                                                       override.aes=list(size=5))) +
               scale_size_continuous(name = "Cohort Size",
                                     range = c(5,20),
                                     guide = guide_legend(order = 2)) +
               # X Axis
                theme(
                  axis.line.x = element_line(colour=NA),
                  axis.ticks.x = element_line(colour=NA),
                  axis.title.x = element_text(colour=NA),
                  axis.text.x = element_text(colour=NA)
                ) +
               geom_segment(aes(x=0.2, xend=0.7, y=0, yend=0),
                            size = 0.5,
                            color=ltxt_color,
                            arrow = arrow(length = unit(10,"pt"),
                                          type = "closed")) +
               geom_text(label = "Survival Likelihood",
                         x = 0.7,
                         y = 0.02,
                         family = def_font,
                         color = ltxt_color,
                         size = 5,
                         hjust = 1) +
               # Y Axis
                theme(
                  axis.line.y = element_line(colour=NA),
                  axis.ticks.y = element_line(colour=NA),
                  axis.title.y = element_text(colour=NA),
                  axis.text.y = element_text(colour=NA)
                ) +
               scale_y_continuous(limits=c(-0.12, 0.05)) +
               # Elements
                geom_point(alpha=0.5) +
                geom_text(aes(y = -0.03, label=paste0(round(SurvivalRate*100,0),"%")),
                          size=4,
                          color=txt_color,
                          family=def_font)

income_plot

Using the passenger class as a proxy for income, we find this claim to be TRUE.

appendFarePUnit <- function(dt) { dt %>%  mutate(Fare_P_Unit = Fare / (1 + SibSp + Parch)) }

fares_set <- appendFarePUnit(training_set) %>%
             select(Fare_P_Unit, Pclass, Survived)

rows_p_fare_bin <- 80
fares_pdata <- fares_set %>%
               arrange(Fare_P_Unit) %>%
               mutate(Group = as.integer((row_number() - 1) / rows_p_fare_bin)+1) %>%
               group_by(Group) %>%
               summarise(FareMax = min(Fare_P_Unit),
                         Survived = sum(Survived),
                         CohortSize = n()) %>%
               mutate(SurvivalRate = Survived / CohortSize)
fares_pdata <- rbind(fares_pdata, c(0,0,0,0,0), c(max(fares_pdata$Group)+1,max(fares_set$Fare_P_Unit),0,0,0)) %>%
               arrange(Group)

fares_plot <- ggplot(fares_pdata, aes(x = FareMax,
                                      y = SurvivalRate)) +
              theme_lk() +
              xlab("Fares / Head") +
              scale_x_continuous(labels = scales::dollar) +
              ylab("Survival Likelihood") +
              scale_y_continuous(labels = scales::percent) +
              geom_step(color = get_color(1))

fares_plot

We also noticed this phenomenon in fares, where the higher amount an individual paid for the fares, the more likely he/she will survive the crash.

On Title

Hypothesis 2: Gender information is embedded under the title of the individual.

Most titles are provided based on the gender on the individual, hence we should expect the hypothesis to hold true.

prefix <- function(x) { unlist(strsplit(unlist(strsplit(x, ", "))[2],"\\. "))[1] }

gender_set <- training_set %>%
              mutate(Prefix = as.character(lapply(Name, prefix))) %>%
              select(Prefix, Sex)
prefix_totals <- gender_set %>%
                 group_by(Prefix) %>%
                 summarise(total = n())
gender_sex_totals <- gender_set %>%
                     group_by(Prefix, Sex) %>%
                     summarise(val=n()) %>%
                     left_join(prefix_totals, by=c("Prefix")) %>%
                     mutate(
                       pct = ifelse(Sex == "female",1.,-1.) * val / total
                     ) %>%
                     arrange(desc(val))
gender_sex_totals$Prefix <- factor(gender_sex_totals$Prefix, levels=unique(gender_sex_totals$Prefix))

gender_plot <- ggplot(gender_sex_totals, aes(x = Prefix,
                                             y = Sex,
                                             fill = Sex,
                                             alpha = val)) +
               theme_lk() +
               scale_fill_manual(values=c("female"=as.character(get_color("red")),
                                           "male"=as.character(get_color("blue"))),
                                    guide = "none") +
               scale_alpha_continuous(limits = c(0,25),
                                       range = c(0.3,1),
                                       name = "Cohort Size",
                                       guide = guide_legend(
                                         nrow = 1,
                                         override.aes=list(fill=ltxt_color)
                                       )) +
                xlab("Title") +
                scale_x_discrete(expand = c(0, 0)) +
                scale_y_discrete(expand = c(0, 0),
                                 labels = c("female"="F","male"="M")) +
                ylab("Gender") +
                geom_tile(colour = bg_color, size = 5)

gender_plot

As can be seen, almost every title has a mapping to the gender of the individual, suggesting that every title is associated with a gender. The only notable exception is “Dr”, which is gender neutral. Hence, the hypothesis is TRUE.

Hypothesis 3: Females and more esteemed individuals are more likely to survive than males.

Prior to this analysis, we will group those individuals without “Mr”,“Mrs”,“Miss”, or “Master” title as “Rare title” individuals. These individuals, along with females, would have higher priority in onboarding the lifeboats, hence we assume that they are more likely to survive the crash.

appendTitle <- function(data) {
  data %>%
  mutate(Title = as.character(lapply(Name, prefix))) %>%
  mutate(Title = ifelse(Title %in% c("Mr","Mrs","Miss","Master"), Title,
                        ifelse(Sex == "male","Rare Title / M", "Rare Title / F")))
}

title_set <- appendTitle(training_set) %>%
             select(Title, Sex, Survived) %>%
             group_by(Title, Sex) %>%
             summarise(survive = sum(Survived) / n()) %>%
             arrange(desc(survive))

title_set$Title <- factor(title_set$Title, levels=c(title_set$Title))

title_plot <- ggplot(title_set, aes(x=Title,
                                    y=survive,
                                    fill=Sex)) +
              theme_lk() +
              scale_fill_manual(name = "Gender",
                                  label = c("male"="Male", "female"="Female"),
                                  values = c("male"= as.character(get_color("blue")),
                                             "female" = as.character(get_color("red"))),
                                   guide = guide_legend()) +

              xlab("Title") +
              ylab("Survival Likelihood") +
              scale_y_continuous(labels = scales::percent,
                                 # Make Sure Bar and X-Axis Stick Together
                                 expand = c(0,0)) +
              geom_bar(stat="identity",
                       width = .5) +
              geom_text(aes(y = survive - 0.08,
                            label=paste0(round(survive*100),"%")),
                        color = "#FFFFFF",
                        family=def_font,
                        size=4)

title_plot

As expected, females are more likely to survive than males, while “esteemed” titles increase the survival likelihood of both genders. Hence, the hypothesis is TRUE.

On Age

Hypothesis 4: Individuals who are very young and very old are more likely to survive.
age_set <- training_set %>%
           select(Age, Survived) %>%
           filter(Age > 0)

rows_p_age_bin <- 50
age_pdata <- age_set %>%
                 arrange(Age) %>%
                 mutate(Group = as.integer((row_number() - 1)/rows_p_age_bin) + 1) %>%
                 group_by(Group) %>%
                 summarise(AgeMin = min(Age),
                           Survived = sum(Survived),
                           CohortSize = n(),
                           SurviveL = Survived/CohortSize)
age_pdata <- rbind(age_pdata,
                   c(0,0,0,0,0),
                   c(max(age_pdata$Group)+1,max(age_set$Age),0,0,0)) %>%
             arrange(Group)

age_plot <- ggplot(age_pdata, aes(x=AgeMin,
                                      y=SurviveL)) +
            theme_lk() +
            xlab("Age") +
            scale_x_continuous(breaks=seq(0,max(age_pdata$AgeMin),by=10)) +
            ylab("Survival Likelihood") +
            scale_y_continuous(labels = scales::percent,
                               expand = c(0.4, -0.15)) +
                  # Actual Data
                  geom_step(color=get_color(1)) +
                  geom_vline(data = age_pdata %>%
                                    filter(row_number() %in% c(3,16)),
                             aes(xintercept = AgeMin),
                             linetype = "dashed",
                             color = ltxt_color) +
                  geom_text(data = age_pdata %>%
                                   filter(row_number() %in% c(3,16)),
                            aes(x = AgeMin + 6, label = paste(AgeMin, "Years Old")),
                            y = max(age_pdata$SurviveL),
                            color = ltxt_color,
                            size = 5,
                            family = def_font)

age_plot

With the limited data set, we can see that younger individuals are more likely to survive. A possible explanation is probably because babies and toddlers occupy less space (and hence easier to find seats within the lifeboats).

However, older people, especially those more than 60 years old, are less likely to survive. This is probably due to their lack of agility in responding to the crash.

Hence, the premise is TRUE for young children and FALSE for older people.

On Company

Hypothesis 5: Individuals with children are more likely to survive, while individuals with same-age company are less likely to survive.
company_set <- training_set %>%
                select(SibSp, Parch, Survived) %>%
                group_by(SibSp, Parch) %>%
                summarise(CohortSize = n(),
                          SurvivalRate = sum(Survived)/n())

company_plot <- ggplot(company_set, aes(x = as.factor(Parch),
                                        y = as.factor(SibSp),
                                        fill = SurvivalRate,
                                        alpha = CohortSize)) +
                theme_lk() + 
                  ########### LEGENDS ###########
                  scale_fill_gradientn(name = "Survival Likelihood",
                                       colours=c(as.character(get_color("red")),
                                                 as.character(get_color("red")),
                                                 as.character(get_color("yellow")),
                                                 as.character(get_color("green")),
                                                 as.character(get_color("green"))),
                                       values = c(0.0,0.4,0.5,0.6,1.0),
                                       guide = "none") +
                  scale_alpha_continuous(limits = c(0,25),
                                         name = "Cohort Size",
                                         guide = guide_legend(
                                           nrow = 1,
                                           override.aes=list(fill=ltxt_color)
                                         )) +
                  ########### X-AXIS ###########
                  xlab("# of Different-Age Company (e.g. Children / Parents)") +
                  scale_x_discrete(expand = c(0, 0)) +
                  ########### Y-AXIS ###########
                  ylab("# of Same-Age Company (e.g. Spouse / Siblings)") +
                  ########### ELEMENTS ###########
                geom_tile(colour = bg_color, size = 7) +
                geom_text(aes(label = paste0(round(SurvivalRate*100),"%")),
                          color = "white",
                          alpha = 1,
                          size = 5,
                          family = def_font)

company_plot

As can be seen from the chart above, individuals with 1 to 2 children are more likely to survive than those without any children. It is inconclusive to determine from the chart above, whether same-age company has an impact of survivalhood. Hence, this premise is TRUE TO A LIMITED EXTENT.

On Cabin Positions

Hypothesis 6: Cabin positions should have an impact of survivalhood, but only to a certain extent.

Cabins further away from the lifeboats will have a more difficult time surviving, hence we should assume that where passengers stay will have an impact on their survivalhood. However, the cabin positions can only determine survivalhood partially, given that passengers may not be at their cabins during the time of crash.

A more detailed deck plan can be found here. As can be seen, the letter of the cabin represents the deck (floor) where the room is located. We also see that odd numbers correspond to one side of the Titanic, while even numbers correspond to the other.

We will break down the cabin position into three constituents: the cabin floor, the cabin number (odd or even), and the number of cabins specified.

getCabinFloor <- function(x) {

  if (x == "") { return("NA") }

  ltr <- unique(strsplit(gsub("[0-9]","",x)," ")[[1]])
  if (length(ltr) == 1) {
    ltr
  } else {
    "Multiple"
  }

}

getCabinNumber <- function(x) {

  if (x == "") { return("NA") }

  num <- as.integer(strsplit(gsub("[A-z]","",x)," ")[[1]]) %% 2

  if (is.nan(mean(num))) {
    return("NA")
  } else if (length(unique(num)) == 1) {
    if (unique(num) == 0) { return("Even") } else { return("Odd") }
  } else {
    return("Mixed")
  }

}

getCabinCount <- function(x) {
  if (x == "") { return("NA") }
  length(strsplit(x," ")[[1]])
}

appendCabinInfo <- function(data) {
  data %>%
  mutate(
    CabinFloor = sapply(Cabin, getCabinFloor),
    CabinNumber = sapply(Cabin, getCabinNumber),
    CabinCount = sapply(Cabin, getCabinCount)
  )
}

cabin_set <- appendCabinInfo(training_set) %>%
             select(CabinFloor,CabinNumber,CabinCount, Survived)
Cabin Floors
cabinLet_set <- cabin_set %>%
                group_by(CabinFloor) %>%
                summarise(CohortSize = n(),
                          SurvivalRate = sum(Survived)/n())

cabinLet_plot <- ggplot(cabinLet_set, aes(x=as.factor(CabinFloor),
                                          y=SurvivalRate,
                                          alpha = CohortSize,
                                          fill = SurvivalRate)) +
                  theme_lk() + 
                    ########### LEGENDS ###########
                  scale_alpha_continuous(name = "Cohort Size",
                                         limits = c(0,30),
                                         guide = guide_legend(
                                           nrow = 1,
                                           override.aes=list(fill=ltxt_color)
                                         )) +
                  scale_fill_gradientn(colours=c(as.character(get_color("red")),
                                                 as.character(get_color("red")),
                                                 as.character(get_color("yellow")),
                                                 as.character(get_color("green")),
                                                 as.character(get_color("green"))),
                                       values = c(0.0,0.4,0.5,0.6,1.0),
                                       guide = "none") +
                    ########### X-AXIS ###########
                    xlab("Cabin Floor") +
                    ########### Y-AXIS ###########
                    ylab("Survival Likelihood") +
                    scale_y_continuous(labels = scales::percent,
                                       # Make Sure Bar and X-Axis Stick Together
                                       expand = c(0,0),
                                       limits =c(0,1)) +
                    ########### ELEMENTS ###########
                  # Actual Data
                  geom_bar(stat="identity") +
                    geom_text(aes(y = SurvivalRate - 0.08,
                                  label=paste0(round(SurvivalRate*100),"%")),
                              color = "#FFFFFF", alpha = 1,
                              family=def_font,size=4)

cabinLet_plot

From the chart, we can deduce that Cabins B to E has a higher likelihood of survival compared to other/unspecified cabins, which suggests that the position of cabins play a role in determining whether a passenger survive.

Cabin Numbers
cabinNumber_set <- cabin_set %>%
                   group_by(CabinNumber) %>%
                    summarise(CohortSize = n(),
                              SurvivalRate = sum(Survived)/n())
cabinNumber_set$CabinNumber <- factor(cabinNumber_set$CabinNumber, levels=c("Even","Odd","Mixed","NA"))

cabinNumber_plot <- ggplot(cabinNumber_set, aes(x = SurvivalRate,
                                                y = 0,
                                                color=CabinNumber,
                                                size=CohortSize)) +
                    theme_lk() +
                    ########### LEGENDS ###########
                      theme(plot.margin = unit(c(0,0,0,-40),'pt')) +
                        # Legends
                        theme(
                          legend.position = c(0.5,0.2),
                          legend.box.just = c(0.5,0.5),
                          legend.justification = c(0.5,0.5)
                        ) +
                      scale_color_manual(name = "Room Type",
                                         values = get_color(),
                                         guide = guide_legend(order = 1,
                                                              direction = "vertical",
                                                              override.aes=list(size=5),
                                                              ncol= 2)) +
                      scale_size_continuous(name = "Cohort Size",
                                            range = c(5,20),
                                            guide = guide_legend(order = 2)) +
                      ########### X-AXIS ###########
                    theme(
                      axis.line.x = element_line(colour=NA),
                      axis.ticks.x = element_line(colour=NA),
                      axis.title.x = element_text(colour=NA),
                      axis.text.x = element_text(colour=NA)
                    ) +
                      # Add X Axis Line
                      geom_segment(aes(x=0.25, xend=0.8, y=0, yend=0),
                                   size = 0.5,
                                   color=ltxt_color,
                                   arrow = arrow(length = unit(10,"pt"),
                                                 type = "closed")) +
                      geom_text(label = "Survival Likelihood",
                                x = 0.8,
                                y = 0.02,
                                family = def_font,
                                color = ltxt_color,
                                size = 5,
                                hjust = 1) +
                      ########### Y-AXIS ###########
                    theme(
                      # Y-Axis
                      axis.line.y = element_line(colour=NA),
                      axis.ticks.y = element_line(colour=NA),
                      axis.title.y = element_text(colour=NA),
                      axis.text.y = element_text(colour=NA)) +
                      scale_y_continuous(limits=c(-0.12, 0.05)) +
                      ########### ELEMENTS ###########
                    geom_point(alpha=0.5) +
                      geom_text(aes(y = -0.03, label=paste0(round(SurvivalRate*100,0),"%")),
                                size=4,
                                color=txt_color,
                                family=def_font)


cabinNumber_plot

It is pretty clear that those who stay in the odd rooms are more likely to survive than those in the even rooms.

Cabins Specified
cabinCount_set <- cabin_set %>%
                  group_by(CabinCount) %>%
                  summarise(CohortSize = n(),
                            SurvivalRate = sum(Survived)/n())

cabinCount_plot <- ggplot(cabinCount_set, aes(x=as.factor(CabinCount),
                                              y=SurvivalRate,
                                              alpha = CohortSize,
                                              fill = SurvivalRate)) +
                   theme_lk() +
                    ########### LEGENDS ###########
                    scale_alpha_continuous(name = "Cohort Size",
                                           limits = c(0,50),
                                           guide = guide_legend(
                                             nrow = 1,
                                             override.aes=list(fill=ltxt_color)
                                           )) +
                    scale_fill_gradientn(colours=c(as.character(get_color("red")),
                                                   as.character(get_color("red")),
                                                   as.character(get_color("yellow")),
                                                   as.character(get_color("green")),
                                                   as.character(get_color("green"))),
                                         values = c(0.0,0.3,0.4,0.50,1.),
                                         guide = "none") +
                    ########### X-AXIS ###########
                    xlab("# Cabins Specified") +
                    ########### Y-AXIS ###########
                    ylab("Survival Likelihood") +
                    scale_y_continuous(labels = scales::percent,
                                       # Make Sure Bar and X-Axis Stick Together
                                       expand = c(0,0),
                                       limits =c(0,1)) +
                    ########### ELEMENTS ###########
                  # Actual Data
                  geom_bar(stat="identity",
                           width = 0.7) +
                  geom_text(aes(y = SurvivalRate - 0.1,
                                label=paste0("Cohort\nSize:\n",CohortSize)),
                            color = "#FFFFFF", alpha = 1,
                            family=def_font,size=4)

cabinCount_plot

Even though seems to be an inverse relationship between the survival likelihood and number of cabins specified, the sample size is too small to ensure statistical significance.

In conclusion, cabin floors and cabin numbers can determine a passenger’s survival likelihood, while there is too little data to deduce a relationship in the number of cabins specified. Hence, the hypothesis is TRUE.

On Embakartion Ports

Hypothesis 7: Port of embarkation should have no impact on survivalhood.
embark_set <- training_set %>%
              filter(Embarked != "") %>%
              group_by(Embarked) %>%
              summarise(CohortSize = n(),
                        SurvivalRate = sum(Survived)/n()) %>%
              mutate(
                Port = ifelse(Embarked == "C","Cherbough",
                              ifelse(Embarked == "Q","Queenstown","Southampton")),
                Lat = ifelse(Embarked == "C",49.645009,
                      ifelse(Embarked == "Q",51.851,50.9038684)),
                Lng = ifelse(Embarked == "C",-1.62444,
                      ifelse(Embarked == "Q",-8.2967,-1.4176118 ))
              )


# Create Color Palette
col_grad <- colorRamp(c(get_color("red"),get_color("yellow"), get_color("green")))
col_limits <- c(0.4,0.6)
colpal <- function(x) {
            col_list <- col_grad((min(max(x,col_limits[1]),col_limits[2])-col_limits[1])/(col_limits[2]-col_limits[1]))
            rgb(col_list[1],col_list[2],col_list[3], maxColorValue= 255)
          }
size_range <- c(10,15)

# Create Marker Icon
tit_icon <- awesomeIcons(
            icon = 'ship',
            iconColor = '#FFFFFF',
            library = 'fa',
            markerColor = 'gray'
          )

# Create Map
map <- leaflet(width = "100%") %>%
       addProviderTiles("CartoDB.Positron") %>%
        # Location of Titanic Crash
        addAwesomeMarkers(-50.2333324,41.7666636, icon=tit_icon,popup="Titanic Crash Site")

pwalk(
  list(embark_set$Port, embark_set$Lat, embark_set$Lng, embark_set$SurvivalRate, embark_set$CohortSize),
  function(port_name, lat, lng, srate, cohort) {
    map <<- map %>%
            addCircleMarkers(lng, lat,
                             popup=paste0(port_name, "<br>","Survival Likelihood: ",round(srate*100),"%"),
                             color = colpal(srate),
                             radius = max(min(cohort/20, size_range[2]),size_range[1]),
                             opacity = 0.8,
                             fillOpacity = 0.5)
  }
)

map


At first glance, there seems to be no plausible explanation why a passenger’s port of embarkation will have an impact on the passenger’s survival likelihood.

# Assessing the correlation between Embarkation and Passenger Class

embark_pclass <- training_set %>%
                 select(Pclass, Embarked, Survived) %>%
                 filter(Embarked != "") %>%
                 group_by(Pclass, Embarked) %>%
                 summarise(CohortSize=n(),
                           SurvivalRate=sum(Survived)/n())
embark_pclass <- embark_pclass %>%
                 left_join(embark_pclass %>% group_by(Embarked) %>% summarise(TotalSize = sum(CohortSize)), by=c("Embarked")) %>%
                 mutate(PctEmbarked = CohortSize / TotalSize) %>%
                 arrange(Pclass, desc(PctEmbarked))
embark_pclass$Embarked <- factor(embark_pclass$Embarked, levels=unique(embark_pclass$Embarked))
embark_pclass$Pclass <- factor(embark_pclass$Pclass, levels=rev(unique(embark_pclass$Pclass)))

embark_pclass_plot <- ggplot(embark_pclass, aes(x=as.factor(Embarked),
                                                y=PctEmbarked,
                                                fill = as.factor(Pclass))) +
                      theme_lk() + 
                  ########### LEGENDS ###########
                  scale_fill_manual(name = "Passenger Class",
                     labels = c("1"="1 (High-Income)","2"="2 (Medium-Income)","3"="3 (Low-Income)"),
                     values = c("1"=get_color("green"),
                                "2"=alpha(get_color("yellow"),0.2),
                                "3"=alpha(get_color("red"),0.2)),
                     guide = guide_legend(reverse = TRUE)) +
                  ########### X-AXIS ###########
                  xlab("Port of Embarkation") +
                  scale_x_discrete(labels=c("S"="Southampton",
                                            "C"="Cherbough",
                                            "Q"="Queenstown")) +
                  ########### Y-AXIS ###########
                  ylab("Survival Likelihood") +
                  scale_y_continuous(labels = scales::percent,
                                     # Make Sure Bar and X-Axis Stick Together
                                     expand = c(0,0),
                                     limits =c(0,1)) +
                  ########### FLIP TO HORIZONTAL BAR CHART ###########
                  coord_flip() +
                  ########### ELEMENTS ###########
                  # Actual Data
                  geom_bar(stat="identity") +
                  geom_text(aes(label=paste0(round(PctEmbarked*100),"%"),
                                y = PctEmbarked + 0.05),
                            data = embark_pclass %>% filter(Pclass == 1),
                            color = get_color("green"),
                            family = def_font, size = 5)

embark_pclass_plot

However, by studying the demographics of the passengers who embarked at each port, we know that a higher proportion of Cherbough are high-income. This explains the higher survival likelihood for those who embarked at Cherbough, and renders the hypothesis to be FALSE.

Preparation

Based on the above hypotheses, we will be adding the following features to predict the survival likelihood of an individual.

Feature Variable Type Hypotheses Supporting the Feature Null Handling
Pclass Continuous Hypothesis 1 -1
Fare Continuous Hypothesis 1 -1
Sex Categorical Hypothesis 3 ‘Unknown’ Category Variable
Title Categorical Hypothesis 3 ‘Unknown’ Category Variable
Age Continuous Hypothesis 4 Regression
SibSp Continuous Hypothesis 5 -1
Parch Continuous Hypothesis 5 -1
CabinFloor Categorical Hypothesis 6 ‘Unknown’ Category Variable
CabinNumber Categorical Hypothesis 6 ‘Unknown’ Category Variable
Port of Embarkation Categorical Hypothesis 7 ‘Unknown’ Category Variable

Below is an example of the cleansed data set for model ingestion.

### Create Age as Regression Parameters
age_data <- training_set %>%
            appendTitle() %>%
            filter(Age > 0)
age_m <- lm(Age ~ Title + SibSp + Parch + factor(Pclass), data = age_data)

clean_data <- function(data,is_training = TRUE) {
  output <- appendTitle(data) %>%
            appendCabinInfo()

  # Populate Age
  output <- output %>%
            mutate(AgePredict = predict.lm(age_m, output),
                   Age = ifelse(is.na(Age),AgePredict,Age))

  # Create Null Handler
  null_handler <- function(is_categorical = TRUE) {
    paste0('ifelse(is.na(x) | x %in% c("","NA"),', ifelse(is_categorical,'"Unknown"','-1'),',x)')

  }

  # Specify Features
  features <- c("Pclass"=FALSE,
                "Fare"=FALSE,
                "Sex"=TRUE,
                "Title"=TRUE,
                "Age"=FALSE,
                "SibSp"=FALSE,
                "Parch"=FALSE,
                "CabinFloor"=TRUE,
                "CabinNumber"=TRUE,
                "Embarked"=TRUE)
  mutate_fn <- sapply(1:length(features), function(x) {
                paste0("sapply(",names(features[x]),", function(x) { ",null_handler(features[x])," })")
              })

  output <- output %>%
            mutate_(
              .dots= setNames(mutate_fn, names(features))
            )

  # Specify factors for different features
  output$Sex <- factor(output$Sex, levels = c("male","female","Unknown"))
  output$Title <- factor(output$Title, levels = c("Mr","Mrs","Miss","Master","Rare Title / M","Rare Title / F"))
  output$CabinFloor <- factor(output$CabinFloor, levels = c("A","B","C","D","E","F","G","T","Multiple","Unknown"))
  output$CabinNumber <- factor(output$CabinNumber, levels = c("Odd","Even","Mixed","Unknown"))
  output$Embarked <- factor(output$Embarked, levels = c("S","C","Q","Unknown"))
  if (is_training) {
    output$Survived <- factor(output$Survived, levels = c(0,1))
  }

  # Specify columns to extract
  if(is_training) {
    columns <- c("Survived"=TRUE,features)
  } else {
    columns <- features
  }

  output %>% select_(.dots=names(columns))
}

cleaned_set <- clean_data(training_set)

knitr::kable(head(cleaned_set),format="html")
Survived Pclass Fare Sex Title Age SibSp Parch CabinFloor CabinNumber Embarked
0 3 7.2500 male Mr 22.0000 1 0 Unknown Unknown S
1 1 71.2833 female Mrs 38.0000 1 0 C Odd C
1 3 7.9250 female Miss 26.0000 0 0 Unknown Unknown S
1 1 53.1000 female Mrs 35.0000 1 0 C Odd S
0 3 8.0500 male Mr 35.0000 0 0 Unknown Unknown S
0 3 8.4583 male Mr 29.8705 0 0 Unknown Unknown Q

The Random Forest Model

Listed below are the preliminary results from the initial run using the Random Forest model.

set.seed(1)
rf_model <- randomForest(Survived ~ .,
                         data = cleaned_set,
                         importance=TRUE,
                         ntree=2000)

rf_model
## 
## Call:
##  randomForest(formula = Survived ~ ., data = cleaned_set, importance = TRUE,      ntree = 2000) 
##                Type of random forest: classification
##                      Number of trees: 2000
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 16.84%
## Confusion matrix:
##     0   1 class.error
## 0 492  57   0.1038251
## 1  93 249   0.2719298
importance(rf_model)
##                    0          1 MeanDecreaseAccuracy MeanDecreaseGini
## Pclass      47.21663 47.4942132             69.39251         28.07819
## Fare        43.19055 43.9893907             66.67144         58.87971
## Sex         42.24730 36.8114966             42.57382         50.52881
## Title       54.59914 57.1290385             60.72891         77.20460
## Age         34.58641 39.4252418             53.95813         52.61366
## SibSp       44.09189  0.8302836             42.35741         18.13917
## Parch       11.04993  9.5437824             15.49590         10.00568
## CabinFloor  24.50409  6.9765507             27.03299         21.64158
## CabinNumber 17.25665 14.8254202             23.33504         12.55936
## Embarked    17.95344 29.6962125             35.40686         10.48846
p_y <- predict(rf_model, newdata = cleaned_set %>% select(-Survived))
err_out <-  cleaned_set %>%
            mutate(p_y = p_y) %>%
            filter(Survived != p_y)
  
knitr::kable(head(err_out),format="html")
Survived Pclass Fare Sex Title Age SibSp Parch CabinFloor CabinNumber Embarked p_y
1 2 13.0000 male Mr 32.99135 0 0 Unknown Unknown S 0
1 3 31.3875 female Mrs 38.00000 1 5 Unknown Unknown S 0
1 3 7.2292 male Mr 29.87050 0 0 Unknown Unknown C 0
0 2 21.0000 female Mrs 27.00000 1 0 Unknown Unknown S 1
1 1 35.5000 male Mr 41.47767 0 0 C Even S 0
1 3 9.5000 male Mr 29.00000 0 0 Unknown Unknown S 0

Predictive Results